%%% MATLAB script TRFragCalc %%%
% @ Richard L. Hahnke, 2013
% Last updated: R.L. Hahnke, 20 August 2013

%{
TRFragCalc M-file - in silico prediction of terminal restriction fragments

The script TRFragCalc was written in MATLAB (v.7.9.0.529, R2009b) to 
perform in silico a terminal restriction fragment length polymorphism 
(T-RFLP) analysis of 16S rRNA gene sequences. For more information on 
T-RFLP, please read Liu et al. (1997). The script provides an interactive 
procedure for a step-by-step analysis.

Before starting TRFragCalc (Step 0) the 16S rRNA gene sequences of the 
SILVA database need to be prepared in ARB (Ludwig et al. 2004). 
Export all 16S rRNA gene sequences (which are of interest) that match 
with 0 to 2 mismatches the T-RFLP forward primer (e.g. 8-27F) and reverse 
primer (e.g. 907R). Truncate the sequences to the length specified by the 
forward and reverse primer using the editor in ARB.

The 16S rRNA gene sequences are exported from ARB and imported into MATLAB 
via XML files. The data in the XML files are converted into a MATLAB 
structure format (Step 1). During the import each XML file is treated as a 
separate branch, which makes it possible to differentiate between bacterial 
groups during the analysis (e.g. Alphaproteobacteria vs. 
Gammaproteobacteria).

To ensure that the results of the in silico digestion are comparable 
between the sequences, TRFragCalc prepares the sequences (Step 2) by (i) 
excluding sequences with ambiguities, (ii) excluding sequences that do 
not match the forward and reverse primer, and (iii) removing the 5' and 3' 
overhang.

For in silico T-RFLP analysis (Step 3) the script provides the 
opportunities to determine (i) the length of the 16S rRNA gene sequences, 
(ii) the length of the in silico terminal restriction fragments (iTRF), 
and (iii) the G+C and A+G content of each sequence.

Finally (Step 4), the script makes it possible to search for all 16S rRNA 
gene sequences that belong to an iTRF of a specified length, or to determine 
the T-RFLP pattern of a bacterial group.

Step by step
0. Step - Preparation of 16S rRNA sequences from ARB

1. Step - load ARB data
choose whether you would like to
alternative A load ARB data from ONE XML file
alternative B load ARB data from many XML files
alternative C read existing MAT file

2.  Step - Prepare the data for in silico T-RFLP
2.A Step - Exclude sequences with ambiguities
2.B Step - Exclude sequences that do not match the forward or reverse primer
2.C Step - Remove the 5' and 3' overhang

3.  Step - in silico T-RFLP analysis
3.A Step - Determine the length of all 16S rRNA gene sequences
3.B Step - Determine the length of in silico terminal restriction fragments (iTRF)
3.C Step - Calculate G+C and A+G content of each sequence

export to Excel (is under construction)

4.  Step - Search for a taxon of interest or a TRF of given length
4.A Step - Search species with defined TRF size
4.B Step - Search for a special group of organisms
%}

%% 0. Step - Preparation of 16S rRNA sequences from ARB
% Print the title of the manual preparation step and a reminder that the
% ARB export has to be prepared before TRFragCalc is started.
fprintf('%s\n', ...
    '0. Step - Preparation of 16S rRNA sequences from ARB', ...
    'Before starting TRFragCalc, you have to prepare the data from ARB');

%%%% 0.A ARB files can be exported in XML format.
fprintf('%s\n', '0.A Step - ARB files export in XML format');

%%% !!! 1
%{
The function Read_ARB_XML of TRFragCalc uses the MATLAB function xmlread
to read XML files. Because xmlread works within the limited Java heap space,
export the branches of the ARB tree so that each resulting XML file is
smaller than 25 MB.
%}

%%% !!! 2
%{
Reducing the amount of attributes per 16S rRNA sequence
will save calculation time and memory.
%}

%%% information within the XML file:

% root =
%{
<ARB_SEQ_EXPORT database="RHCyanobacteria1fm.arb" export_date="Wed Dec  3 12:23:47 2008">
root end=
</ARB_SEQ_EXPORT>
%}

% tree structure =
%{
<ARB_SEQ_EXPORT database="RHCyanobacteria1fm.arb" export_date="Wed Dec  3 12:23:47 2008">
    <species name="species1">
        <acc>xx1</acc>
        <ALIGNMENT name="16s">
            <data>AGAGUU ... ACGG</data>
        </ALIGNMENT>
        <nuc>863</nuc>
    </species>
    <species name="species2">
        <acc>xx2</acc>
        <ALIGNMENT name="16s">
            <data>AGAGUU ... ACGG</data>
        </ALIGNMENT>
        <nuc>863</nuc>
    </species>
</ARB_SEQ_EXPORT>
%}

% example species data =
%{
<species name="UncSy654">
  <acc>AY125384</acc>
  <ALIGNMENT name="16s">
   <data>AGAGUU ... ACGG</data>
  </ALIGNMENT>
  <author>Zubkov M.V.; Fuchs B.M.; </author>
  <clone>A315024</clone>
  <date>2003-02-25;</date>
  <description>Uncultured Synechococcus sp. clone A315024 16S ribosomal RNA gene, partial sequence.</description>
  <full_name>uncultured Synechococcus sp.</full_name>
  <product>16S ribosomal RNA</product>
  <journal>Appl. Environ. Microbiol. 69:1299-1304 (2003)</journal>
  <title>High rate of uptake of organic nitrogen compounds by Prochlorococcus cyanobacteria as a key to their dominance in oligotrophic oceanic waters</title>
  <version>1</version>
  <nuc_region>1..1216</nuc_region>
  <nuc_rp>1-1216</nuc_rp>
  <submit_author>Zubkov M.V.; Fuchs B.M.; Tarran G.A.; Burkill P.H.; Amann R.; ; </submit_author>
  <submit_date>21-JUN-2002 Plymouth Marine Laboratory, PL1 3DH, Plymouth PL1 3DH, United Kingdom</submit_date>
  <align_bp_score_slv>115</align_bp_score_slv>
  <align_cutoff_head_slv>0</align_cutoff_head_slv>
  <align_cutoff_tail_slv>0</align_cutoff_tail_slv>
  <align_family_slv>UncSyn36:0.981758 Aq9A0y06:0.981758 UncSyn19:0.965174 UncSyn11:0.965174 Aq9A0yy5:0.965174 Acyyyy07:0.965174 SynSp160:0.955224 SynSp136:0.955224 UncSyn12:0.955224 SynSp164:0.955224 Aq9A0yyy:0.955224 SynSp162:0.955224 UncSyn30:0.955224 SynSp155:0.955224 SynSp140:0.955224 UncSyn35:0.955224 Acyyyy02:0.955224 UncSyn32:0.955224 Aq9A0y05:0.955224 Aq9A0yy4:0.955224 SynSp135:0.955224 SynSp143:0.955224 SynSp165:0.955224 SynSp145:0.955224 Acyyyy05:0.94859 UncSyn10:0.94859 SynSp107:0.945274 Aq9A0yy6:0.945274 Acyyyy08:0.945274 Aq9A0yy0:0.945274 UncSyn14:0.945274 UncSyn28:0.945274 UncSyn20:0.945274 Aq9A0yy2:0.945274 UncSyn34:0.945274 UncSyn26:0.93864 Acyyyy03:0.93864 Acyyy003:0.935323 UncSyn13:0.935323 Aq9A0y03:0.932007 </align_family_slv>
  <align_log_slv>copied identical ARB_C81988E9.1</align_log_slv>
  <align_quality_slv>100</align_quality_slv>
  <aligned_slv>2008-09-30 19:13:41</aligned_slv>
  <ambig_slv>0</ambig_slv>
  <ann_src_slv>EMBL; RDP;</ann_src_slv>
  <pintail_slv>100</pintail_slv>
  <homop_slv>0.41</homop_slv>
  <homop_events_slv>5</homop_events_slv>
  <nuc_gene_slv>1216</nuc_gene_slv>
  <publication_doi>10.1128/AEM.69.2.1299-1304.2003</publication_doi>
  <pubmed_id>12571062</pubmed_id>
  <seq_quality_slv>84</seq_quality_slv>
  <start>1</start>
  <stop>1216</stop>
  <tax_embl>Bacteria;Cyanobacteria;Chroococcales;Synechococcus;environmental samples;</tax_embl>
  <tax_embl_name>uncultured Synechococcus sp.</tax_embl_name>
  <tax_gg>Unclassified;</tax_gg>
  <tax_gg_name>uncultured Synechococcus sp.</tax_gg_name>
  <tax_rdp>Root;Bacteria;Cyanobacteria;Cyanobacteria;Family II;GpIIa;</tax_rdp>
  <tax_rdp_name>uncultured Synechococcus sp.</tax_rdp_name>
  <tax_xref_embl>154535</tax_xref_embl>
  <vector_slv>1.23</vector_slv>
  <nuc_term>1216</nuc_term>
  <tax_slv>Bacteria/Cyanobacteria/Chroococcales/Prochlorococcus et rel./Synechococcus sp.</tax_slv>
  <tmp> </tmp>
  <nuc>863</nuc>
 </species>
%}


%%%% 0.B Delete the following information from the XML file
% Print the title of the manual XML clean-up step described below.
fprintf('%s\n', '0.B Delete the following information from the XML file');

% !!!
%{
--> delete rows in XML file
<!DOCTYPE ARB_SEQ_EXPORT SYSTEM 'arb_seq_export.dtd' [
  <!ENTITY nbsp "&#160;">
  <!ENTITY acute "&#180;">
  <!ENTITY eacute "&#233;">
  <!ENTITY apostr "&#39;">
  <!ENTITY semi "&#59;"> ]>

--> and delete in XML file
<!--There ... s a basic version of ARB_seq_export.dtd in /arb/software/arbmgg071207_32/lib/dtd
but you might need to expand it by yourself,
because the ARB-database may contain any kind of fields.-->

--> afterwards search "apostr" in XML file and set ""
%}

%% global variables

% Issue the "opengl software" command (function-call form).
opengl('software');

%%
%%%%% 1. Step - load ARB data %%%%
fprintf('%s\n', '1. Step - load ARB data');

% List the three ways in which the ARB data can be loaded:
%   A - load ARB data from ONE XML file at a time
%   B - load ARB data from many XML files
%   C - read an existing MAT file
% (fprintf re-applies the format to every argument, i.e. one line per text.)
fprintf('%s\n', ...
    'Choose between the three alternatives.', ...
    'Alternative A: load ARB data XML file by XML file.', ...
    'Alternative B: load ARB data from many XML files automatically.', ...
    'Alternative C: load ARB data from exsisting MAT file.');


%% 1. Step - alternative A - load ARB data from one XML file %%%%
% With this alternative each XML file is loaded manually and the resulting
% branches are appended to ARB_tree one after the other.
%
% Variables written: path, file, filename, ARB_branch, ARB_tree, n
% Requires:          Read_ARB_XML (TRFragCalc function)

disp('You selected: Alternative A: load ARB data XML file by XML file.')

% first XML file %%
% folder that holds the XML files (must end with a file separator)
% NOTE(review): the variable name "path" shadows the MATLAB function path();
% it is kept because other parts of the script may rely on it.
path = '../SILVA rel102/';

% name of the XML file to be read
file = 'Proteobacteria_1.xml'
% filename = '../SILVA rel102/Proteobacteria_1.xml'
filename = [path, file]

% use function Read_ARB_XML to open and read the content of the XML file
ARB_branch = Read_ARB_XML(filename);

% show name of branch
disp(sprintf('%s is loaded (%s).', filename, ARB_branch.Name))

% show number of species
disp(sprintf('%s has -%d- 16S rRNA sequences.', ARB_branch.Name, length(ARB_branch.species)))

ARB_tree(1) = ARB_branch;

% every next XML file %%
% name of the XML file to be read
file = 'Proteobacteria_2.xml'
% FIX: rebuild the full file name. Previously only "file" was changed, so
% Read_ARB_XML was called with the stale "filename" and the first XML file
% was loaded a second time.
filename = [path, file]

% use function Read_ARB_XML to open and read the content of the XML file
ARB_branch = Read_ARB_XML(filename);

% show name of branch
disp(sprintf('%s is loaded (%s).', filename, ARB_branch.Name))

% show number of species
disp(sprintf('%s has -%d- 16S rRNA sequences.', ARB_branch.Name, length(ARB_branch.species)))

% append the new branch behind the branches that are already loaded
n = length(ARB_tree) + 1;
ARB_tree(n) = ARB_branch;

%{
% every next XML file %%
% give name of the XML file to be read
file = 'Proteobacteria_3.xml'

% use function read_ARB_XML to open and read the content of the XML file
ARB_branch = Read_ARB_XML(filename);

% show name of branch
strcat(filename,' is loaded (',ARB_branch.Name,').')

% show number of spezies
disp(ARB_branch.Name,' has -',num2str(length(ARB_branch.species)),'- 16S rRNA sequences.')

n = length(ARB_tree) + 1;
ARB_tree(n) = ARB_branch;

%}
%% 1. Step - alternative B - read many XML files %%%
%{
strcat('You selected: Alternative B: load ARB data from many XML files automatically.')

% path to XML files
path = '../SILVA rel102/';

% give number of XML files to be read
number_XMLfiles = 2;

file = 'ARB_part_1.xml'
filename = strcat(path,file)


% use function read_ARB_XML to open and read the content of the XML file
ARB_branch = Read_ARB_XML(filename);

% show name of branch
strcat(filename,' is loaded (',ARB_branch.Name,').')

% show number of spezies
strcat(ARB_branch.Name,' has -',num2str(length(ARB_branch.species)),'- 16S rRNA sequences.')

ARB_tree(1) = ARB_branch;

for i=2:number_XMLfiles
    
    filename = strcat(path,'ARB_part_ ',num2str(i),'.xml');

    % use function read_ARB_XML to open and read the content of the XML file
    ARB_branch = Read_ARB_XML(filename);

    % show name of branch
    strcat(filename,' is loaded (',ARB_branch.Name,').')

    % show number of spezies
    strcat(ARB_branch.Name,' has -',num2str(length(ARB_branch.species)),'- 16S rRNA sequences.')

    ARB_tree(i) = ARB_branch;
end
%}


%% 1. Step - alternative C - read existing MAT file %%%

fprintf('%s\n', 'You selected: Alternative C: load ARB data from exsisting MAT file.');

% The load call itself is disabled; enable it to read a stored workspace:
% load('ARBtree.mat')
% Content of the MAT file as listed by the author:
%{
structs:            arbTree
                    
string variables:   primerF
                    primerR
                    probe_sequence
%}

%% 2. Step - Prepare sequences before in silico T-RFLP
fprintf('%s\n', '2. Step - Prepare sequences before in silico T-RFLP');

%% 2.A Step - Exclude sequences with ambiguities 
%{
Ambiguity codes that are counted:
R	A or G
Y	C or T
S	G or C
W	A or T
K	G or T
M	A or C
B	C or G or T
D	A or G or T
H	A or C or T
V	A or C or G
N	any base
%}
% Requires: ARB_tree (Step 1), CountUnspecificNucleotides (TRFragCalc function)
% Result:   ARB_tree(iBranch).species keeps only sequences without
%           ambiguities; the removed ones are stored in
%           ARB_tree(iBranch).species_withAmbiguities.

disp('2.A Step - Exclude sequences with ambiguities')


% Loop over the branches and over the species of each branch
% using the function CountUnspecificNucleotides()
%{
unspecificNucleotides (frequency of ambiguity codes), stored in the order
[Ns , Rs , Ys , Ws , Ss , Ms , Ks , Hs , Bs , Vs , Ds]
%}

nBranches = length(ARB_tree);

% Reset the summary arrays so that a repeated run of this cell cannot mix
% fresh values with values left over from an earlier run.
unspecificNucleotides_max = zeros(nBranches, 11);
unspecificNucleotides_min = zeros(nBranches, 11);
unspecificNucleotidesInSequence_max = zeros(1, nBranches);
unspecificNucleotidesInSequence_min = zeros(1, nBranches);
unspNTfrequency = zeros(11, nBranches);

for iBranch = 1:nBranches

    nSpecies = length(ARB_tree(iBranch).species);

    % Start from clean per-species arrays. Without this, a second run on the
    % already filtered (shorter) species list keeps the old trailing
    % entries, which corrupts the counts and the indices used for the split
    % at the end of this step.
    ARB_tree(iBranch).unspecificNucleotides = zeros(nSpecies, 11);
    ARB_tree(iBranch).unspecificNucleotidesInSequence = zeros(1, nSpecies);

    for iSpecies = 1:nSpecies
        
        [unspecificNucleotides , sumOfUnspNucleotides] = CountUnspecificNucleotides(ARB_tree(iBranch).species(iSpecies).Children(2).Data);

        % add frequencies to the branch
        ARB_tree(iBranch).unspecificNucleotides(iSpecies,1:11) = unspecificNucleotides;
        ARB_tree(iBranch).unspecificNucleotidesInSequence(iSpecies) = sumOfUnspNucleotides;
    end
    
    % maximum and minimum frequency of ambiguities (used to give all
    % branches the same histogram bins)
    unspecificNucleotides_max(iBranch , 1:11) = max(ARB_tree(iBranch).unspecificNucleotides, [], 1);
    unspecificNucleotides_min(iBranch , 1:11) = min(ARB_tree(iBranch).unspecificNucleotides, [], 1);
    
    unspecificNucleotidesInSequence_max(iBranch) = max(ARB_tree(iBranch).unspecificNucleotidesInSequence);
    unspecificNucleotidesInSequence_min(iBranch) = min(ARB_tree(iBranch).unspecificNucleotidesInSequence);
    
    % total number of positions per ambiguity code in this branch
    unspNTfrequency(1:11 , iBranch) = sum(ARB_tree(iBranch).unspecificNucleotides , 1);
end

% histogram of the number of ambiguities per sequence

% common bin centers for all branches: 0, 1, ..., largest count found
histSteps_unspNTinSeq = 0:1:max(unspecificNucleotidesInSequence_max);

% one column per branch, one row per bin
unspecificNucleotidesInSequenceFrequency = zeros(length(histSteps_unspNTinSeq), nBranches);

% Loop over the branches to get a histogram of each branch separately
for iBranch = 1:nBranches
    
    if isscalar(histSteps_unspNTinSeq)
        % hist() interprets a scalar second argument as a NUMBER of bins,
        % not as a bin center. With a single bin center every sequence
        % falls into that one bin.
        frequency = numel(ARB_tree(iBranch).unspecificNucleotidesInSequence);
    else
        frequency = hist(ARB_tree(iBranch).unspecificNucleotidesInSequence , histSteps_unspNTinSeq);
    end
    
    unspecificNucleotidesInSequenceFrequency(1:length(histSteps_unspNTinSeq),iBranch) = frequency;
      
end

% plot the histogram as stacked bars (one colour per branch)
figure
bar(histSteps_unspNTinSeq, unspecificNucleotidesInSequenceFrequency, 'stack')
%legend(ARB_tree(1).Name,ARB_tree(2).Name) %%!!!%%
xlabel('# Ambiguities per sequence')
ylabel('# Sequences')
title('Ambiguities distribution')

% distribution of ambiguity types in sequences
% unspNTfrequency(1:11, iBranch)

figure
bar(unspNTfrequency , 'stack')
set(gca,'XTickLabel',{'N', 'R', 'Y', 'W', 'S', 'M', 'K', 'H', 'B', 'V', 'D'})

title('Ambiguities within the 16S rRNA sequences')
xlabel('Ambiguities')
ylabel('# Nucleotide positions with ambiguities')


% show number of 16S rRNA sequences with and without ambiguities
for iBranch = 1:nBranches
    
    % Row 1 is the bin "0 ambiguities"; rows 2:end are all other bins.
    % FIX: "2:end" replaces 2:length(matrix). length() returns the LARGEST
    % dimension, i.e. the number of branches whenever there are more
    % branches than bins, which gave a wrong or out-of-range row index.
    nWithout = unspecificNucleotidesInSequenceFrequency(1,iBranch);
    nWith = sum(unspecificNucleotidesInSequenceFrequency(2:end,iBranch));
    
    disp(sprintf('%s has 16S rRNA sequences without (%d) and with (%d) ambiguities', ARB_tree(iBranch).Name, nWithout, nWith))
     
end

%%% exclude sequences with ambiguities %%%
disp('Split 16S rRNA gene sequences in ARB_branch into sequences with and without ambiguities')


% Loop over the branches and split the species list
% NOTE: unspecificNucleotides and unspecificNucleotidesInSequence are not
% filtered; afterwards they still describe the unfiltered species list.
for iBranch = 1:nBranches
    
    idx = find(ARB_tree(iBranch).unspecificNucleotidesInSequence == 0);      % no ambiguity in sequence
    idxUnsp = find(ARB_tree(iBranch).unspecificNucleotidesInSequence > 0);   % sequences with ambiguities
   
    ARB_tree(iBranch).species_withAmbiguities = ARB_tree(iBranch).species(idxUnsp);
    disp(sprintf('%s has -%d- sequences with ambiguities', ARB_tree(iBranch).Name, length(ARB_tree(iBranch).species_withAmbiguities)))

    ARB_tree(iBranch).species = ARB_tree(iBranch).species(idx);
    disp(sprintf('%s has -%d- sequences without ambiguities', ARB_tree(iBranch).Name, length(ARB_tree(iBranch).species)))
    
end


%% 2.B Step - Exclude sequences that do not match the forward or reverse primer
%{
All forward and some reverse primers should match. Insertions or deletions
in the primer sequence would result in an arbitrarily longer or shorter
16S rRNA gene sequence.

In addition, primer variants that differ by at most one nucleotide were
found in the SILVA database by manually checking the sequences.
The last 8 nucleotides of the primer sequence can influence the polymerase
binding; therefore sequences with mismatches at these nucleotides must be
excluded.
%}

disp('2.B Step - Exclude sequences that do not match the forward or reverse primer')

% forward primer: 27F (FAM)
% 5'-AGA GTT TGA TY    M     TGG CTC AG-3'
%{
% AGAGTTTGATCaTGGCTCAG
% AGAGTTTGATCcTGGCTCAG
% AGAGTTTGATcATGGCTCAG
% AGAGTTTGATcCTGGCTCAG
% AGAGTTTGATtATGGCTCAG
% AGAGTTTGATtCTGGCTCAG
%}
% 5'-AGA GTT TGA T[C,T][A,C] TGG CTC AG-3'

% primer variants with at most one deviating nucleotide
%{
%    AGa GTT TGA TC    A     TGG CTC AG
%    tGA GTT TGA TT    C     TGG CTC AG
%    AaA GTT TGA TC    A     TGG CTC AG
%    AGA GTT TGA aC    C     TGG CTC AG
%    AGA cTT TGA TC    A     TGG CTC AG
%    AGA cTT TGA TC    A     TGG CTC AG
%    AGA CTT aGA TC    t     TGG CTC AG
%    AGA GTT AGA TC    t     TGG CTC AG
%    AGAaGTT TGA TC    C     TGG CTC AG
%    AGA GTT TGA TC    C     TGG CTT AG
%}

% Filter pattern: 12 arbitrary positions followed by the exact last 8
% nucleotides of the forward primer.
% primer = 5'-NNN NNN NNN NNN TGG CTC AG-3'
primerF_Ns = '[A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T]TGGCTCAG';

% Remove every sequence in which the pattern primerF_Ns is not found.
for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberNoMatches = 0;
    % indices of the sequences of THIS branch that have to be removed
    % (plain vector; it used to be a zero-padded matrix indexed by iBranch)
    idxSequences_exclude = [];
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerF_Ns , 'start');

        if isempty(matchIndex)
            numberNoMatches = numberNoMatches + 1;
            idxSequences_exclude(numberNoMatches) = iSpecies;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberNoMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) do not match the forward primer in: %s- with max 1 mismatch.', numberNoMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

    % exclude these sequences from the branch
    if (numberNoMatches ~= 0)
        ARB_tree(iBranch).species(idxSequences_exclude) = [];
    end
    
end

% Count the sequences that contain the forward primer without any mismatch.
% primer = 5'-AGA GTT TGA T[C,T][A,C] TGG CTC AG-3'
primerF = 'AGAGTTTGAT[C,T][C,A]TGGCTCAG';

for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberOfMatches = 0;
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerF , 'end');

        if ~isempty(matchIndex)
            numberOfMatches = numberOfMatches + 1;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberOfMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) match the forward primer in: %s- without mismatches.', numberOfMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

end


% Forward primer variants with one deviating nucleotide (lower case) that
% are covered by the pattern primerF_1MM:
%{
%    AGa GTT TGA TC    A     TGG CTC AG
%    tGA GTT TGA TT    C     TGG CTC AG
%    AaA GTT TGA TC    A     TGG CTC AG
%    AGA GTT TGA aC    C     TGG CTC AG
%    AGA cTT TGA TC    A     TGG CTC AG
%    AGA cTT TGA TC    A     TGG CTC AG
%    AGA CTT aGA TC    t     TGG CTC AG
%    AGA GTT AGA TC    t     TGG CTC AG
%    AGA GTT TtA TC    C     TGG CTC AG
%    AGA GTT TGA TC    g     TGG CTC AG
%    AGA GTT gGA TC    C     TGG CTC AG
%    AGt GTT TGA TC    C     TGG CTC AG
%    AGA GTT TGA Tg    T     TGG CTC AG
%    ...
%}
primerF_1MM = '[A,T][A,G][A,G,T][C,G]TT[A,G,T][G,T]A[A,T][C,G,T][C,A,G,T]TGGCTCAG';

% Count the sequences that match primerF_1MM and remember their indices.
for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberOfMatches = 0;
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerF_1MM , 'end');

        if ~isempty(matchIndex)
            numberOfMatches = numberOfMatches + 1;
            % x(iBranch,:) collects the indices of the matching sequences.
            % NOTE: x is never reset here; rows of branches with fewer
            % matches are padded with zeros.
            x(iBranch,numberOfMatches) = iSpecies;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberOfMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) match the forward primer in: %s- with max 1 mismatch.', numberOfMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

end


% reverse primer 907R (HEX)
% 5'-CCG TCA ATT CCT TTR AGT TT-3'
% reverse complement
% 3'-AAA CTY AAA GGA ATT GAC GG-5'

% 3'-AAA CT[C,T] AAA GGA ATT GAC GG-5'

% Filter pattern: the first 8 nucleotides of the reverse complement,
% followed by 12 arbitrary positions.
% (terminated with ';' like primerF_Ns so the pattern is not echoed)
primerR_Ns = 'AAACT[C,T]AA[A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T][A,G,C,T]';

% Remove every sequence in which the pattern primerR_Ns is not found.
for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberNoMatches = 0;
    % indices of the sequences of THIS branch that have to be removed
    % (plain vector; it used to be a zero-padded matrix indexed by iBranch)
    idxSequences_exclude = [];
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerR_Ns , 'end');

        if isempty(matchIndex)
            numberNoMatches = numberNoMatches + 1;
            idxSequences_exclude(numberNoMatches) = iSpecies;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberNoMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) do not match the reverse primer in: %s- with max 1 mismatch.', numberNoMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

    % exclude these sequences from the branch
    if (numberNoMatches ~= 0)
        ARB_tree(iBranch).species(idxSequences_exclude) = [];
    end
end


% Count the sequences that contain the reverse primer (reverse complement)
% without any mismatch.
primerR = 'AAACT[C,T]AAAGGAATTGACGG';

for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberOfMatches = 0;
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerR , 'end');

        if ~isempty(matchIndex)
            numberOfMatches = numberOfMatches + 1;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberOfMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) match the reverse primer in: %s', numberOfMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

end


% Reverse primer variants with one deviating nucleotide (lower case) that
% are covered by the pattern primerR_1MM:
%{
% AAA CTC AAA GGA ATT GAC GG
% AAA CTC AAA tGA ATT GAC GG
% AAA CTC AAA GGA ATa GAC GG
% AAA CTC AAA GGA gTT GAC GG
%}
primerR_1MM = 'AAACT[C,T]AAA[G,T]GA[A,G]T[A,T]GACGG';

% Count the sequences that match primerR_1MM.
for iBranch = 1:length(ARB_tree)
    
    nSpecies = length(ARB_tree(iBranch).species);
    numberOfMatches = 0;
    
    for iSpecies = 1:nSpecies
     
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerR_1MM , 'end');

        if ~isempty(matchIndex)
            numberOfMatches = numberOfMatches + 1;
        end

    end

    % FIX: the value is printed with a '%' sign, so scale the fraction to a
    % percentage; max(...,1) avoids 0/0 for a branch without sequences.
    matchPercent = 100 * numberOfMatches / max(nSpecies, 1);
    % sprintf keeps the blank after "in:" (strcat strips trailing blanks of
    % its character arguments).
    disp(sprintf('-%d- of -%d- sequences (%.1f%%) match the reverse primer in: %s- with max 1 mismatch.', numberOfMatches, nSpecies, matchPercent, ARB_tree(iBranch).Name))

end



%% 2.C Step - Remove the 5' and 3' overhang
%{
Sequences that have nucleotides outside of the amplified 16S rRNA gene
sequence make the TRFs arbitrarily longer, and thus in silico digestion
would result in longer TRFs.
NNN<forward primer> ACCTGG...TTCGA <reverse primer> NNN
%}
% Requires: ARB_tree, primerF_Ns and primerR_Ns (Step 2.B)
% Result:   ARB_tree(iBranch).startOverhang / .endOverhang hold, per
%           sequence, the number of nucleotides in front of the first
%           primerF_Ns match and behind the last primerR_Ns match.
disp('2.C Step - Remove the 5´ and 3´ overhang')

% determine Ns in 5'- 'NNN<primerF>...<primerR>NN' -3'
for iBranch = 1:length(ARB_tree)

    nSpecies = length(ARB_tree(iBranch).species);

    % Start from clean arrays so that no entries of an earlier run (or of a
    % loaded MAT file) survive behind the current number of sequences.
    ARB_tree(iBranch).startOverhang = zeros(1, nSpecies);
    ARB_tree(iBranch).endOverhang = zeros(1, nSpecies);

    for iSpecies = 1:nSpecies
    
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerF_Ns , 'start');
        if isempty(matchIndex)
            % fail with a clear message instead of an index error
            error('TRFragCalc:forwardPrimerNotFound', 'Sequence %d of branch %d does not match the forward primer pattern. Run Step 2.B first.', iSpecies, iBranch);
        end
        
        % overhang at the 5' end: everything in front of the first match
        ARB_tree(iBranch).startOverhang(iSpecies) = matchIndex(1) - 1;
        
        
        matchIndex = regexpi(ARB_tree(iBranch).species(iSpecies).Children(2).Data , primerR_Ns , 'end');
        if isempty(matchIndex)
            % fail with a clear message instead of an index error
            error('TRFragCalc:reversePrimerNotFound', 'Sequence %d of branch %d does not match the reverse primer pattern. Run Step 2.B first.', iSpecies, iBranch);
        end
        
        % overhang at the 3' end: everything behind the end of the last match
        ARB_tree(iBranch).endOverhang(iSpecies) = length(ARB_tree(iBranch).species(iSpecies).Children(2).Data) - matchIndex(end);
    end
end

% Remove the 3' overhang first: cutting at the end of a sequence does not
% shift the positions at its start, so startOverhang stays valid for the
% 5' trimming that follows.
for iBranch = 1:length(ARB_tree) 
    % if the reverse primer does not end with the last nucleotide, then
    % the sequence has an overhang at the 3' side of the primer
    idx_overhang = find(ARB_tree(iBranch).endOverhang ~= 0);
    
    disp(strcat('-', num2str(length(idx_overhang)),'- sequences have an 3´ overhang in -', ARB_tree(iBranch).Name,'-'))    
    
    % shorten the sequences so that they end with the primer end;
    % loop over the sequences with an overhang only (the loop body is
    % skipped when idx_overhang is empty)
    for iIdx = 1:length(idx_overhang)
        
        % determine length of overhang
        overhang = ARB_tree(iBranch).endOverhang(idx_overhang(iIdx));
        
        % show accession and sequence before trimming
        acc_sequence = ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(1).Data;
        disp(acc_sequence)
        disp(ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data)
        
        % FIX: remove the overhanging nucleotides from the END of the
        % sequence. The previous index (1:overhang) deleted the first
        % nucleotides, i.e. it cut into the 5' end and left the 3' overhang
        % in place.
        ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data(end-overhang+1:end) = [];
        
        % show result
        disp(strcat('From acc -',acc_sequence,'- of -', ARB_tree(iBranch).Name,'- -',num2str(overhang),'- nucleotide(s) were removed.'))
        disp(ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data)
        
    end
end
 

% Remove the 5' overhang: delete everything in front of the first match of
% the forward primer pattern (length stored in startOverhang).
for iBranch = 1:length(ARB_tree)
        
    % if the forward primer does not start with the first nucleotide, then
    % the sequence has an overhang at the 5' side of the primer
    idx_overhang = find(ARB_tree(iBranch).startOverhang ~= 0);
    
    disp(strcat('-', num2str(length(idx_overhang)),'- sequences have an 5´ overhang in -', ARB_tree(iBranch).Name,'-'))
    
    % shorten the sequences so that they start with the primer start;
    % loop over the sequences with an overhang only (the loop body is
    % skipped when idx_overhang is empty)
    for iIdx = 1:length(idx_overhang)
        
        % determine length of overhang
        overhang = ARB_tree(iBranch).startOverhang(idx_overhang(iIdx));
        
        % show accession and sequence before trimming
        acc_sequence = ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(1).Data;
        
        disp(acc_sequence)
        disp(ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data)
        
        % remove the overhanging nucleotides from the start of the sequence
        ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data(1:overhang) = [];
        
        % show result
        % FIX: message text read "nucleotide()s"; now identical to the
        % wording of the 3' message.
        disp(strcat('From acc -',acc_sequence,'- of -', ARB_tree(iBranch).Name,'- -',num2str(overhang),'- nucleotide(s) were removed.'))
        disp(ARB_tree(iBranch).species(idx_overhang(iIdx)).Children(2).Data)
        
    end
end   
    


%% 3.A Step - Determine the length of all 16S rRNA gene sequences
% Requires: ARB_tree with the prepared sequences (Step 2)
% Result:   ARB_tree(iBranch).sequenceLength (one entry per sequence),
%           lengthFrequency (rows: length bins, columns: branches) and a
%           stacked bar plot of the length distribution.
disp('3.A Step - Determine the length of all 16S rRNA gene sequences')

% Loop over the branches and over the sequences of each branch
% 16S rRNA length = number of characters in the sequence

nBranches = length(ARB_tree);

% Reset the summary arrays so that a repeated run of this cell cannot mix
% fresh values with values left over from an earlier run.
sequenceLength_max = zeros(1, nBranches);
sequenceLength_min = zeros(1, nBranches);

for iBranch=1:nBranches
    
    nSpecies = length(ARB_tree(iBranch).species);
    
    % clean array: no entries of an earlier, longer species list survive
    ARB_tree(iBranch).sequenceLength = zeros(1, nSpecies);
    
    % determine sequence length
    for iSpecies=1:nSpecies
        ARB_tree(iBranch).sequenceLength(iSpecies) = length(ARB_tree(iBranch).species(iSpecies).Children(2).Data);
    end
    
    % maximum and minimum sequence length (used to give all branches the
    % same histogram bins)
    sequenceLength_max(iBranch) = max(ARB_tree(iBranch).sequenceLength);
    sequenceLength_min(iBranch) = min(ARB_tree(iBranch).sequenceLength);
    
end

% histogram of the sequence lengths

% common bin centers for all branches: one bin per length (1 nt steps)
histSteps = min(sequenceLength_min):1:max(sequenceLength_max);

% one column per branch, one row per bin
lengthFrequency = zeros(length(histSteps), nBranches);

% Loop over the branches to get a histogram of each branch separately
for iBranch = 1:nBranches
    
    if isscalar(histSteps)
        % FIX: when all sequences have the same length, histSteps is a
        % scalar and hist() would interpret it as a NUMBER of bins, which
        % makes the assignment below fail. With a single bin center every
        % sequence falls into that one bin.
        frequency = numel(ARB_tree(iBranch).sequenceLength);
    else
        frequency = hist(ARB_tree(iBranch).sequenceLength,histSteps);
    end
    
    lengthFrequency(1:length(histSteps),iBranch) = frequency;
    
    % automatic table of branch names for plot legend
    %branche_legend(iBranch,1:length(ARB_tree(iBranch).Name)) = ARB_tree(iBranch).Name
end

%branche_legend = branche_legend';

% plot the histogram as stacked bars (one colour per branch)
figure
bar(histSteps, lengthFrequency, 'stack')
%legend(ARB_tree(1).Name,ARB_tree(2).Name) %%!!!%%
xlabel('Sequence length (nt)')
ylabel('# Sequences')
title('Sequence length distribution')




%% 3.B Step - Determine the length of in silico terminal restriction fragments (iTRF)
%{
%%% struct of ARB_tree

%"acc"
ARB_tree(iBranch).species(iSpecies).Children(1).Name = 'acc'
ARB_tree(iBranch).species(iSpecies).Children(1).Data

%"sequence"
ARB_tree(iBranch).species(iSpecies).Children(2).Name = 'sequence'
ARB_tree(iBranch).species(iSpecies).Children(2).Data

%"full_name"
ARB_tree(iBranch).species(iSpecies).Children(3).Name = 'full_name'
ARB_tree(iBranch).species(iSpecies).Children(3).Data

%"discription"
ARB_tree(iBranch).species(iSpecies).Children(4).Name = 'discription'
ARB_tree(iBranch).species(iSpecies).Children(4).Data
%}

% using regular expressions
%{
% regexp, regexpi 
Match regular expression

Syntax
regexp('str', 'expr')
[start_idx, end_idx, extents, matches, tokens, names, splits] = regexp('str', 'expr')
[v1, v2, ...] = regexp('str', 'expr', q1,q2, ...)
[v1 v2 ...] = regexp('str', 'expr', ..., options)
%}

% In silico digestion of every sequence of every branch.
% Requires: ARB_tree with the prepared sequences, InSilicoDigestion
%           (TRFragCalc function; its outputs are stored as returned -
%           presumably fragment lengths in nucleotides, TODO confirm)
% Result:   per sequence values in ARB_tree(iBranch).numberOfRRS,
%           .fullDigest_forward_iTRF and .fullDigest_reverse_iTRF, plus the
%           per branch minima and maxima of these three values.
disp('3.B Step - Determine the length of in silico terminal restriction fragments (iTRF)')

% restriction recognition site (RRS) of the restriction enzyme AluI
RRS = 'AGCT';
disp(strcat('TRFragCalc uses the restriction recognition site -', RRS, '- to digest the 16S rRNA gene sequence.'))

% Loop over the branches and over the species of each branch for the
% in silico digestion
% [RRS_index , numberOfRRS , fullDigest_forward_iTRF , fullDigest_reverse_iTRF] = InSilicoDigestion(RRS , sequence)
for iBranch = 1:length(ARB_tree)
    for iSpecies = 1:length(ARB_tree(iBranch).species)
        
        % digest the sequence (Children(2).Data) with InSilicoDigestion();
        % RRS_index is returned but not stored in ARB_tree
        [RRS_index , numberOfRRS , fullDigest_forward_iTRF , fullDigest_reverse_iTRF] = InSilicoDigestion(RRS , ARB_tree(iBranch).species(iSpecies).Children(2).Data);
    
        % store the number of RRS for each sequence
        ARB_tree(iBranch).numberOfRRS(iSpecies) = numberOfRRS;
        
        % store forward iTRFs
        ARB_tree(iBranch).fullDigest_forward_iTRF(iSpecies) = fullDigest_forward_iTRF;
        
        % store reverse iTRFs
        ARB_tree(iBranch).fullDigest_reverse_iTRF(iSpecies) = fullDigest_reverse_iTRF;
    
    end
    
    % maximum and minimum numberOfRRS of this branch (used later to give
    % all branches the same histogram bins)
    numberOfRRS_max(iBranch) = max(ARB_tree(iBranch).numberOfRRS);
    numberOfRRS_min(iBranch) = min(ARB_tree(iBranch).numberOfRRS);
    
    % maximum and minimum fullDigest_forward_iTRF of this branch (used
    % later to give all branches the same histogram bins)
    fullDigest_forward_iTRF_max(iBranch) = max(ARB_tree(iBranch).fullDigest_forward_iTRF);
    fullDigest_forward_iTRF_min(iBranch) = min(ARB_tree(iBranch).fullDigest_forward_iTRF);
    
    % maximum and minimum fullDigest_reverse_iTRF of this branch (used
    % later to give all branches the same histogram bins)
    fullDigest_reverse_iTRF_max(iBranch) = max(ARB_tree(iBranch).fullDigest_reverse_iTRF);
    fullDigest_reverse_iTRF_min(iBranch) = min(ARB_tree(iBranch).fullDigest_reverse_iTRF);
end

% calculate histograms
% frequency = hist(values, centers) returns the frequency counts per bin,
% where centers holds the bin centers

% common bin centers (step 1) for all branches, so that the histograms of
% the branches can be stacked
histSteps_RRS = min(numberOfRRS_min):1:max(numberOfRRS_max);
histSteps_fTRF = min(fullDigest_forward_iTRF_min):1:max(fullDigest_forward_iTRF_max);
histSteps_rTRF = min(fullDigest_reverse_iTRF_min):1:max(fullDigest_reverse_iTRF_max);

% one row per bin, one column per branch; preallocating also discards
% results of a previous run with a different number of bins or branches
RRS_frequency = zeros(length(histSteps_RRS), length(ARB_tree));
forward_iTRF_frequency = zeros(length(histSteps_fTRF), length(ARB_tree));
reverse_iTRF_frequency = zeros(length(histSteps_rTRF), length(ARB_tree));

% Loop over branches to get a histogram of each branch separately.
% NOTE: hist() interprets a SCALAR second argument as the number of bins,
% not as a bin center. If all values are identical the center vector
% collapses to a scalar; in that case every (non-NaN) value belongs to the
% single bin and is counted directly.
for iBranch = 1:length(ARB_tree)
    
    % histogram of RRS number
    if isscalar(histSteps_RRS)
        frequency = sum(~isnan(ARB_tree(iBranch).numberOfRRS));
    else
        frequency = hist(ARB_tree(iBranch).numberOfRRS , histSteps_RRS);
    end
    RRS_frequency(:,iBranch) = frequency(:);
    
    % histogram of forward iTRF
    if isscalar(histSteps_fTRF)
        frequency = sum(~isnan(ARB_tree(iBranch).fullDigest_forward_iTRF));
    else
        frequency = hist(ARB_tree(iBranch).fullDigest_forward_iTRF , histSteps_fTRF);
    end
    forward_iTRF_frequency(:,iBranch) = frequency(:);
    
    % histogram of reverse iTRF
    if isscalar(histSteps_rTRF)
        frequency = sum(~isnan(ARB_tree(iBranch).fullDigest_reverse_iTRF));
    else
        frequency = hist(ARB_tree(iBranch).fullDigest_reverse_iTRF , histSteps_rTRF);
    end
    reverse_iTRF_frequency(:,iBranch) = frequency(:);
    
end


% use bar(xout,n) to plot the histograms as stacked bars (one colour per
% branch)

% legend entries: the names of all branches, in branch order (replaces the
% former legend call that was hard-coded to exactly two branches)
branchNames = {ARB_tree.Name};

% RRS distribution
figure
bar(histSteps_RRS, RRS_frequency, 'stack')
hLegend = legend(branchNames);
set(hLegend, 'Interpreter', 'none') % show underscores in names literally
xlabel('Number of RRS in 16S rRNA sequence')
ylabel('# Sequences')
title('Distribution of restriction recognition sites')

% t-RFLP pattern forward
figure
bar(histSteps_fTRF, forward_iTRF_frequency, 'stack')
hLegend = legend(branchNames);
set(hLegend, 'Interpreter', 'none')
xlabel('Sequence length (nt)')
ylabel('# Sequences')
title('Distribution of forward fragments of full digest')

% t-RFLP pattern reverse
figure
bar(histSteps_rTRF, reverse_iTRF_frequency, 'stack')
hLegend = legend(branchNames);
set(hLegend, 'Interpreter', 'none')
xlabel('Sequence length (nt)')
ylabel('# Sequences')
title('Distribution of reverse fragments of full digest')



%% 3.C Step - Calculate G+C and A+G content of each sequence
%{
It has frequently been shown that differences in the A+G content can
influence the size determination in the T-RFLP analysis, due to a
difference in migration within the capillary electrophoresis. Heavier
sequences with a larger proportion of adenine (AMP, M = 312.19 g/mol) and 
guanine (GMP, M = 328.19 g/mol) might migrate slower than sequences with 
larger proportions of thymidine (TMP, M = 303.18 g/mol) and cytosine
(CMP, M = 288.17 g/mol).
%}
disp('3.C Step - Calculate G+C and A+G content of each sequence')

% Preallocate the per-branch extrema; this also discards values left in
% the workspace by an earlier run that used more branches.
GC_content_max = zeros(1, length(ARB_tree));
GC_content_min = zeros(1, length(ARB_tree));
AG_content_max = zeros(1, length(ARB_tree));
AG_content_min = zeros(1, length(ARB_tree));

for iBranch = 1:length(ARB_tree)
    
    % nucleotide counts of this branch: one row per species,
    % columns = [A G C T]
    nucleotideFequency = zeros(length(ARB_tree(iBranch).species),4);
    
    % reset the result vectors of this branch (one entry per species)
    ARB_tree(iBranch).GC_content = zeros(1, length(ARB_tree(iBranch).species));
    ARB_tree(iBranch).AG_content = zeros(1, length(ARB_tree(iBranch).species));
    
    for iSpecies = 1:length(ARB_tree(iBranch).species)
        
        % sequence of the current species, upper-cased so that the counts
        % below are case-insensitive
        seqData = upper(ARB_tree(iBranch).species(iSpecies).Children(2).Data);
        
        % count AMPs, GMPs, CMPs and TMPs
        nucleotideFequency(iSpecies,1) = sum(seqData == 'A');
        nucleotideFequency(iSpecies,2) = sum(seqData == 'G');
        nucleotideFequency(iSpecies,3) = sum(seqData == 'C');
        nucleotideFequency(iSpecies,4) = sum(seqData == 'T');
    
        % G+C content (%), relative to the full length of the stored
        % sequence (all characters, not only A/C/G/T)
        ARB_tree(iBranch).GC_content(iSpecies) = 100 * (nucleotideFequency(iSpecies,2) + nucleotideFequency(iSpecies,3)) / length(seqData);
    
        % A+G content (%), relative to the full length of the stored
        % sequence
        ARB_tree(iBranch).AG_content(iSpecies) = 100 * (nucleotideFequency(iSpecies,1) + nucleotideFequency(iSpecies,2)) / length(seqData);
    
    end
    
    % maximum and minimum per branch; they define a common bin range for
    % the histograms of all branches
    GC_content_max(iBranch) = max(ARB_tree(iBranch).GC_content);
    GC_content_min(iBranch) = min(ARB_tree(iBranch).GC_content);
    
    AG_content_max(iBranch) = max(ARB_tree(iBranch).AG_content);
    AG_content_min(iBranch) = min(ARB_tree(iBranch).AG_content);
    
end

% calculate histograms
% frequency = hist(values, centers) returns the frequency counts per bin,
% where centers holds the bin centers

% common bin centers (step of 1 %) for all branches, so that the
% histograms of the branches can be stacked
histSteps_GCc = min(GC_content_min):1:max(GC_content_max);
histSteps_AGc = min(AG_content_min):1:max(AG_content_max);

% one row per bin, one column per branch; preallocating also discards
% results of a previous run with a different number of bins or branches
GCc_frequency = zeros(length(histSteps_GCc), length(ARB_tree));
AGc_frequency = zeros(length(histSteps_AGc), length(ARB_tree));

% Loop over branches to get a histogram of each branch separately.
% NOTE: hist() interprets a SCALAR second argument as the number of bins,
% not as a bin center. If the contents of all sequences span less than
% 1 %, the center vector collapses to a scalar; in that case every
% (non-NaN) value belongs to the single bin and is counted directly.
for iBranch = 1:length(ARB_tree)
    
    % histogram of G+C content
    if isscalar(histSteps_GCc)
        frequency = sum(~isnan(ARB_tree(iBranch).GC_content));
    else
        frequency = hist(ARB_tree(iBranch).GC_content , histSteps_GCc);
    end
    GCc_frequency(:,iBranch) = frequency(:);
    
    % histogram of A+G content
    if isscalar(histSteps_AGc)
        frequency = sum(~isnan(ARB_tree(iBranch).AG_content));
    else
        frequency = hist(ARB_tree(iBranch).AG_content , histSteps_AGc);
    end
    AGc_frequency(:,iBranch) = frequency(:);
    
end

% use bar(xout,n) to plot the histograms as stacked bars (one colour per
% branch)

% legend entries: the names of all branches, in branch order (replaces the
% former legend call that was hard-coded to exactly two branches)
branchNames = {ARB_tree.Name};

% G+C content distribution
figure
bar(histSteps_GCc, GCc_frequency, 'stack')
hLegend = legend(branchNames);
set(hLegend, 'Interpreter', 'none') % show underscores in names literally
xlabel('G+C content (%)')
ylabel('# Sequences')
title('Distribution of G+C content')

% A+G content distribution
figure
bar(histSteps_AGc, AGc_frequency, 'stack')
hLegend = legend(branchNames);
set(hLegend, 'Interpreter', 'none')
xlabel('A+G content (%)')
ylabel('# Sequences')
title('Distribution of A+G content')


%% - export to Excel 
%{
in progress
%}

%% 4.A Step - Search species with defined TRF size
disp('4.A Step - Search species with defined TRF size')

% enter the TRF length of interest (nt)
% enter one length (e.g. TRFsize = 270;)
% under construction: 
% a vector of TRF lengths (e.g. TRFsize = [269 , 270];)
TRFsize = 600;

for iBranch = 1:length(ARB_tree)    
    
    % indices of all species of this branch whose forward iTRF of the full
    % digest has exactly the requested length
    %[TRF_idx] = FindSpeciesOfTRFsize(TRFsize, ARB_tree(iBranch));
    IDX = find(ARB_tree(iBranch).fullDigest_forward_iTRF == TRFsize);
    
    % prepare for export: one cell per hit, taken from Children(3),
    % Children(1) and Children(5) of the species entry.
    % Cell arrays are filled with {}-indexing; assigning a char array with
    % ()-indexing into a cell array raises a conversion error.
    % NOTE(review): the three lists are re-created for every branch, so
    % after the loop they hold the hits of the last branch only - confirm
    % that this is intended once the export is implemented.
    TRFs_name = cell(numel(IDX),1);
    TRF_acc = cell(numel(IDX),1);
    TRF_tax = cell(numel(IDX),1);
    
    for iIDX = 1:numel(IDX)
        
        TRFs_name{iIDX} = ARB_tree(iBranch).species(IDX(iIDX)).Children(3).Data;
        TRF_acc{iIDX} = ARB_tree(iBranch).species(IDX(iIDX)).Children(1).Data;
        TRF_tax{iIDX} = ARB_tree(iBranch).species(IDX(iIDX)).Children(5).Data;
        
    end
    
    % report the number of hits once per branch; numel(IDX) is 0 when no
    % sequence matches, so no separate "no hit" branch is required
    % (isvector() cannot be used for that: it is true for the 1-by-0
    % result that find returns for a row vector without matches)
    disp(strcat(ARB_tree(iBranch).Name,'- has -',num2str(numel(IDX)),'- sequences with the TRF length of -',num2str(TRFsize),'- nucleotides.'))
    
end

